# Training-run settings stored as flat key = value pairs (no [section] header).
# NOTE(review): the parser that reads this file is not visible from here; confirm
# it accepts full-line '#' comments before relying on these annotations.
# Presumably the sequence length in tokens; TODO confirm.
seqlength = 4096
# Presumably the batch size per step; TODO confirm whether this is per device or global.
batchsize = 1
# Presumably the number of gradient-accumulation steps per optimizer update; TODO confirm.
accumulate_steps = 44
# 100,000,000 (1e8). Presumably the total number of training tokens; TODO confirm.
train_tokens = 100000000
# 2.56e14. Presumably a theoretical peak FLOP/s figure used for utilization estimates;
# TODO confirm the unit and whether it is per device.
theoryflops = 256000000000000.0
# Presumably the number of passes over the training data; TODO confirm.
epochs = 1
# Booleans below use the capitalized spelling (False); keep that spelling unless the
# consuming parser is known to be case-insensitive.
# Presumably toggles a FlashAttention code path; TODO confirm.
flashattn = False
# Presumably toggles activation recomputation (checkpointing); TODO confirm.
recompute = False
# Presumably the tensor-parallel degree; TODO confirm.
tensor_parallel = 8
# Presumably the pipeline-parallel degree; TODO confirm.
pipeline_parallel = 10
